TL;DR

Applying our zero-inflated model to the olfactory data.

Unnormalized data

We select only the cells that pass QC, color coded by the experimental time point. I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it should not work other than on Davide’s computer.

load("Expt4c2b_filtdata.Rda")
load("E4c2b_slingshot_wsforkelly.RData")

To speed up the computations, I will focus on the top 1,000 most variable genes.

names(batch) <- colnames(counts)

counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]

vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]

PCA

First, let’s look at PCA (of the log counts) for reference.

par(mfrow=c(1, 2))
detection_rate <- colSums(core>0)
coverage <- colSums(core)
colMerged <- cc_rev[clus.labels]

pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=19, main="PCA of log-counts, centered not scaled")

fq <- EDASeq::betweenLaneNormalization(counts, which="full")

pcafq <- prcomp(t(log1p(fq)))
plot(pcafq$x, col=colMerged, pch=19, main="PCA of FQ log-counts, centered not scaled")

The first plot is the unnormalized data and the second plot is after full-quantile normalization, which is what Russell and Diya used for the paper.

They found that to fully explain the differences between clusters, we need three dimensions.

pairs(pcafq$x[,1:3], col=colMerged, pch=19, main="PCA of FQ log-counts, centered not scaled")

df <- data.frame(PC1=pcafq$x[,1], PC2=pcafq$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                        PC1        PC2 detection_rate    coverage
## PC1             1.00000000 -0.0432585      0.2643194 -0.02214266
## PC2            -0.04325850  1.0000000      0.6508453  0.19979644
## detection_rate  0.26431940  0.6508453      1.0000000  0.41994811
## coverage       -0.02214266  0.1997964      0.4199481  1.00000000

Even after full-quantile normalization, there is some residual correlation between PC2 and detection rate.

library(scone)
library(rARPACK)
fastpca <- function(expr, scale=FALSE) {
  svd_raw <- svds(scale(t(expr), center=TRUE, scale=scale), k=3, nu=3, nv=0)
  pc_raw <- svd_raw$u %*% diag(svd_raw$d[1:3])
  return(pc_raw)
}

raw <- as.matrix(counts)
totalcount = function (ei)
{
  sums = colSums(ei)
  eo = t(t(ei)*mean(sums)/sums)
  return(eo)
}

tc <- totalcount(raw)
fq <- FQT_FN(raw)
tmm <- TMM_FN(raw)

vargenes <- rownames(core)

pc_raw <- fastpca(log1p(raw[vargenes,]))
pc_tc <- fastpca(log1p(tc[vargenes,]))
pc_fq <- fastpca(log1p(fq[vargenes,]))
pc_tmm <- fastpca(log1p(tmm[vargenes,]))
wrapRzifa <- function(Y, block = TRUE, k=2){
  # wrapper R function for ZIFA.
  # md5 hashing and temporary files are used not to re-run zifa 
  # if it has already be run on this computer.
  d = digest(Y, "md5")
  tmp = paste0(tempdir(), '/', d)
  write.csv(Y, paste0(tmp, '.csv'))
  
  if (!file.exists(paste0(tmp, "_", k, '_zifa.csv'))){
    print('run ZIFA')
    bb = ifelse(block, '-b ', '')
    cmd = sprintf('python run_zifa.py -d %d %s%s.csv %s_%d_zifa.csv', k, bb, tmp, tmp, k)
    system(cmd)
  }
  read.csv(sprintf("%s_%d_zifa.csv", tmp, k), header=FALSE)
}

zifa_raw <- wrapRzifa(log1p(raw[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_tc <- wrapRzifa(log1p(tc[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_tmm <- wrapRzifa(log1p(tmm[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_fq <- wrapRzifa(log1p(fq[vargenes,]), k=3)
## [1] "run ZIFA"

ZINB

print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 3)))
##    user  system elapsed 
## 130.370  21.678  63.712
pairs(zinb_Vall@W, col=colMerged, pch=19, main="ZINB")

qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)

df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], W3=zinb_Vall@W[,3], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage, quality=qcpca$x[,1])
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                         W1          W2           W3    gamma_mu
## W1              1.00000000 -0.16683191 -0.107484692  0.03024469
## W2             -0.16683191  1.00000000  0.056838063  0.01042277
## W3             -0.10748469  0.05683806  1.000000000 -0.17160625
## gamma_mu        0.03024469  0.01042277 -0.171606253  1.00000000
## gamma_pi        0.25935897  0.07017869 -0.014958757 -0.09570741
## detection_rate -0.33704112 -0.15008428 -0.027467546  0.22824345
## coverage        0.07794123 -0.04513153  0.007492650  0.90197020
## quality         0.11001325  0.09386843 -0.003560763 -0.55770491
##                   gamma_pi detection_rate    coverage      quality
## W1              0.25935897    -0.33704112  0.07794123  0.110013250
## W2              0.07017869    -0.15008428 -0.04513153  0.093868427
## W3             -0.01495876    -0.02746755  0.00749265 -0.003560763
## gamma_mu       -0.09570741     0.22824345  0.90197020 -0.557704907
## gamma_pi        1.00000000    -0.96579839 -0.31884978  0.299364588
## detection_rate -0.96579839     1.00000000  0.41994811 -0.347477558
## coverage       -0.31884978     0.41994811  1.00000000 -0.609447350
## quality         0.29936459    -0.34747756 -0.60944735  1.000000000

Accounting for quality

mod <- model.matrix(~qcpca$x[,1:5])
print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3, X=mod)))
##    user  system elapsed 
## 206.298  30.046  92.629
pairs(zinb_3@W, col=colMerged, pch=19, main="ZINB")

#total number of detected genes in the cell
df <- data.frame(W1=zinb_3@W[,1], W2=zinb_3@W[,2], W3=zinb_3@W[,3], gamma_mu = zinb_3@gamma_mu[1,], gamma_pi = zinb_3@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                         W1           W2           W3    gamma_mu
## W1              1.00000000  0.018099164  0.131838743  0.09537145
## W2              0.01809916  1.000000000  0.001966816  0.11518619
## W3              0.13183874  0.001966816  1.000000000 -0.03705229
## gamma_mu        0.09537145  0.115186186 -0.037052286  1.00000000
## gamma_pi       -0.07415694  0.056694880 -0.130417497 -0.28456970
## detection_rate  0.14377846 -0.126369965  0.153647636  0.48846297
## coverage        0.02138167  0.124259640  0.106557525  0.87012468
##                   gamma_pi detection_rate    coverage
## W1             -0.07415694      0.1437785  0.02138167
## W2              0.05669488     -0.1263700  0.12425964
## W3             -0.13041750      0.1536476  0.10655753
## gamma_mu       -0.28456970      0.4884630  0.87012468
## gamma_pi        1.00000000     -0.9406926 -0.25076070
## detection_rate -0.94069259      1.0000000  0.41994811
## coverage       -0.25076070      0.4199481  1.00000000

Four dimensions

print(system.time(zinb_4 <- zinbFit(core, ncores = 3, K = 4)))
##    user  system elapsed 
## 151.480  25.479  71.973
pairs(zinb_4@W, col=colMerged, pch=19, main="ZINB")

save(zinb_3, zinb_4, zinb_Vall, pc_tmm, pc_fq, pc_tc, pc_raw, zifa_fq, zifa_tmm, zifa_tc, zifa_raw, file="olfactory.rda")